In [1]:
Copied!
import pathpyG as pp
import torch
from pathpyG.nn.dbgnn import DBGNN
from torch_geometric.transforms import RandomNodeSplit
import torch_geometric
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
pp.config['torch']['device'] = 'cpu'
device = pp.config['torch']['device']
import pathpyG as pp
import torch
from pathpyG.nn.dbgnn import DBGNN
from torch_geometric.transforms import RandomNodeSplit
import torch_geometric
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
pp.config['torch']['device'] = 'cpu'
device = pp.config['torch']['device']
Load the synthetic dataset¶
In [2]:
Copied!
# Read temporal network
t = pp.TemporalGraph.from_csv('../data/temporal_clusters.tedges')
# Read temporal network
t = pp.TemporalGraph.from_csv('../data/temporal_clusters.tedges')
In [3]:
Copied!
node_colors = ['green']*10+['red']*10+['blue']*10
node_colors = ['green']*10+['red']*10+['blue']*10
In [4]:
Copied!
style = {}
style['node_color'] = node_colors
style = {}
style['node_color'] = node_colors
In [5]:
Copied!
pp.plot(t, **style);
pp.plot(t, **style);
In [6]:
Copied!
# read the paths
paths_original = pp.PathData.from_csv('../data/temporal_clusters.ngram')
print(paths_original)
print(paths_original.num_nodes)
# read the paths
paths_original = pp.PathData.from_csv('../data/temporal_clusters.ngram')
print(paths_original)
print(paths_original.num_nodes)
PathData with 7460 walks and 0 dags and total weight 29042 30
In [7]:
Copied!
# calculate paths
dag = pp.algorithms.temporal_graph_to_event_dag(t, delta=1)
print(dag)
# calculate paths
dag = pp.algorithms.temporal_graph_to_event_dag(t, delta=1)
print(dag)
Graph with 89032 nodes and 60000 edges Node attributes node_id <class 'list'> node_name <class 'list'> node_idx <class 'list'> Edge attributes edge_ts <class 'torch.Tensor'> -> torch.Size([60000]) Graph attributes num_nodes <class 'int'>
In [8]:
Copied!
paths = pp.PathData.from_temporal_dag(dag)
print(paths)
paths = pp.PathData.from_temporal_dag(dag)
print(paths)
PathData with 29032 walks and 0 dags and total weight 29032
In [11]:
Copied!
print(len(set([tuple(pp.PathData.walk_to_node_seq(v).tolist()) for v in paths.paths.values()])))
print(len(set([tuple(pp.PathData.walk_to_node_seq(v).tolist()) for v in paths.paths.values()])))
7078
In [9]:
Copied!
# Create the graph corresponding to paths
g = pp.HigherOrderGraph(paths, order=1)
# Plotting the time-aggregated network (first-order graph)
pp.plot(g);
# Create the graph corresponding to paths
g = pp.HigherOrderGraph(paths, order=1)
# Plotting the time-aggregated network (first-order graph)
pp.plot(g);
In [10]:
Copied!
# Create the second-order graph corresponding to paths
g2 = pp.HigherOrderGraph(paths, order=2)
# Plotting the second-order graph
pp.plot(g2);
# Create the second-order graph corresponding to paths
g2 = pp.HigherOrderGraph(paths, order=2)
# Plotting the second-order graph
pp.plot(g2);
In [15]:
Copied!
t_shuffled = pp.TemporalGraph.from_csv('../data/temporal_clusters.tedges')
t_shuffled.data['t'] = t.data['t'][torch.randperm(len(t_shuffled.data['t']))]
t_shuffled = pp.TemporalGraph.from_csv('../data/temporal_clusters.tedges')
t_shuffled.data['t'] = t.data['t'][torch.randperm(len(t_shuffled.data['t']))]
In [16]:
Copied!
t.data.t
t.data.t
Out[16]:
tensor([ 0, 1, 2, ..., 59997, 59998, 59999])
In [18]:
Copied!
t_shuffled.data.t
t_shuffled.data.t
Out[18]:
tensor([30587, 15778, 16467, ..., 11520, 29890, 56463])
In [25]:
Copied!
# calculate paths
dag_shuffled = pp.algorithms.temporal_graph_to_event_dag(t_shuffled, delta=1)
print(dag_shuffled)
# calculate paths
dag_shuffled = pp.algorithms.temporal_graph_to_event_dag(t_shuffled, delta=1)
print(dag_shuffled)
Graph with 118038 nodes and 60000 edges Node attributes node_id <class 'list'> node_idx <class 'list'> node_name <class 'list'> Edge attributes edge_ts <class 'torch.Tensor'> -> torch.Size([60000]) Graph attributes num_nodes <class 'int'>
In [26]:
Copied!
paths_shuffled = pp.PathData.from_temporal_dag(dag_shuffled)
print(paths_shuffled)
print(paths_shuffled.num_nodes)
paths_shuffled = pp.PathData.from_temporal_dag(dag_shuffled)
print(paths_shuffled)
print(paths_shuffled.num_nodes)
PathData with 58038 walks and 0 dags and total weight 58038 30
In [29]:
Copied!
# Create the second-order graph corresponding to paths
g2_shuffled = pp.HigherOrderGraph(paths_shuffled, order=2)
print(g2_shuffled)
# Plotting the second-order graph
pp.plot(g2_shuffled);
# Create the second-order graph corresponding to paths
g2_shuffled = pp.HigherOrderGraph(paths_shuffled, order=2)
print(g2_shuffled)
# Plotting the second-order graph
pp.plot(g2_shuffled);
HigherOrderGraph (k=2) with 849 nodes and 1871 edges Total edge weight = 1962.0 Edge attributes edge_weight <class 'torch.Tensor'> -> torch.Size([1871]) Graph attributes node_id <class 'list'> num_nodes <class 'int'>
Prepare the data¶
In [ ]:
Copied!
# Define edge indices for first and second-order graphs
edge_index_g1 = g.data.edge_index
edge_index_g2 = g2.data.edge_index
# Define edge indices for first and second-order graphs
edge_index_g1 = g.data.edge_index
edge_index_g2 = g2.data.edge_index
In [ ]:
Copied!
# Define edge weights
edge_weights = g.data['edge_weight']
edge_weights_higher_order = g2.data['edge_weight']
# Define edge weights
edge_weights = g.data['edge_weight']
edge_weights_higher_order = g2.data['edge_weight']
In [ ]:
Copied!
# Define bipartite mapping
import torch
def generate_bipatite_edge_index(mapping='last'):
    """Build the bipartite edge index linking second-order nodes to first-order nodes.

    Each second-order node index in ``g2.node_index_to_id`` maps to a pair of
    first-order node ids.  The returned ``2 x E`` tensor connects every
    second-order node (row 0) to one (or both) of its first-order nodes (row 1).

    Args:
        mapping: ``'last'``  -> connect to the last node of the pair
                 ``'first'`` -> connect to the first node of the pair
                 ``'both'``  -> connect to both (two edges per second-order node)

    Returns:
        torch.Tensor of shape (2, E).

    Raises:
        ValueError: if ``mapping`` is not one of 'last', 'first', 'both'.
    """
    ho_indices = list(g2.node_index_to_id.keys())
    pairs = list(g2.node_index_to_id.values())
    if mapping == 'last':
        bipartite_edge_index = torch.tensor([ho_indices,
                                             [p[1] for p in pairs]])
    elif mapping == 'first':
        bipartite_edge_index = torch.tensor([ho_indices,
                                             [p[0] for p in pairs]])
    elif mapping == 'both':
        bipartite_edge_index = torch.tensor([ho_indices + ho_indices,
                                             [p[0] for p in pairs] + [p[1] for p in pairs]])
    else:
        # Previously any unrecognized string silently fell through to the
        # 'both' behavior; fail loudly so a typo cannot change the mapping.
        raise ValueError(f"unknown mapping {mapping!r}; expected 'last', 'first' or 'both'")
    return bipartite_edge_index
# Original DBGNN implementation mapping = 'last'
bipatite_edge_index = generate_bipatite_edge_index(mapping='last')
# Define bipartite mapping
import torch
def generate_bipatite_edge_index(mapping='last'):
    """Build the bipartite edge index linking second-order nodes to first-order nodes.

    Each second-order node index in ``g2.node_index_to_id`` maps to a pair of
    first-order node ids.  The returned ``2 x E`` tensor connects every
    second-order node (row 0) to one (or both) of its first-order nodes (row 1).

    Args:
        mapping: ``'last'``  -> connect to the last node of the pair
                 ``'first'`` -> connect to the first node of the pair
                 ``'both'``  -> connect to both (two edges per second-order node)

    Returns:
        torch.Tensor of shape (2, E).

    Raises:
        ValueError: if ``mapping`` is not one of 'last', 'first', 'both'.
    """
    ho_indices = list(g2.node_index_to_id.keys())
    pairs = list(g2.node_index_to_id.values())
    if mapping == 'last':
        bipartite_edge_index = torch.tensor([ho_indices,
                                             [p[1] for p in pairs]])
    elif mapping == 'first':
        bipartite_edge_index = torch.tensor([ho_indices,
                                             [p[0] for p in pairs]])
    elif mapping == 'both':
        bipartite_edge_index = torch.tensor([ho_indices + ho_indices,
                                             [p[0] for p in pairs] + [p[1] for p in pairs]])
    else:
        # Previously any unrecognized string silently fell through to the
        # 'both' behavior; fail loudly so a typo cannot change the mapping.
        raise ValueError(f"unknown mapping {mapping!r}; expected 'last', 'first' or 'both'")
    return bipartite_edge_index
# Original DBGNN implementation mapping = 'last'
bipatite_edge_index = generate_bipatite_edge_index(mapping='last')
In [ ]:
Copied!
# Define the PyG data object
from torch_geometric.data import Data
num_nodes = max(max(g.data['edge_index'][0]), max(g.data['edge_index'][1])).item() + 1 # since indexing starts from 0
num_ho_nodes = max(max(g2.data['edge_index'][0]), max(g2.data['edge_index'][1])).item() + 1 # since indexing starts from 0
data = Data(
num_nodes = num_nodes,
num_ho_nodes = num_ho_nodes,
x = torch.eye(num_nodes, num_nodes),
x_h = torch.eye(num_ho_nodes, num_ho_nodes),
edge_index = edge_index_g1,
edge_index_higher_order = edge_index_g2,
edge_weights = edge_weights.float(),
edge_weights_higher_order = edge_weights_higher_order.float(),
bipartite_edge_index = bipatite_edge_index,
y = torch.tensor([ int(i) // 10 for i in paths.node_id])
)
# Define the PyG data object
from torch_geometric.data import Data
num_nodes = max(max(g.data['edge_index'][0]), max(g.data['edge_index'][1])).item() + 1 # since indexing starts from 0
num_ho_nodes = max(max(g2.data['edge_index'][0]), max(g2.data['edge_index'][1])).item() + 1 # since indexing starts from 0
data = Data(
num_nodes = num_nodes,
num_ho_nodes = num_ho_nodes,
x = torch.eye(num_nodes, num_nodes),
x_h = torch.eye(num_ho_nodes, num_ho_nodes),
edge_index = edge_index_g1,
edge_index_higher_order = edge_index_g2,
edge_weights = edge_weights.float(),
edge_weights_higher_order = edge_weights_higher_order.float(),
bipartite_edge_index = bipatite_edge_index,
y = torch.tensor([ int(i) // 10 for i in paths.node_id])
)
DBGNN¶
In [ ]:
Copied!
from sklearn.metrics import balanced_accuracy_score
def test(model, data):
    """Evaluate `model` on `data` and return balanced accuracy on both splits.

    Args:
        model: the DBGNN model; switched to eval mode (disables dropout).
        data: PyG Data object with `y`, `train_mask` and `test_mask`.

    Returns:
        (train_balanced_accuracy, test_balanced_accuracy) as floats.
    """
    model.eval()
    # No gradients are needed for evaluation; avoids building an autograd graph.
    with torch.no_grad():
        _, pred = model(data).max(dim=1)
    metrics_train = balanced_accuracy_score(
        data.y[data.train_mask].cpu().numpy(),
        pred[data.train_mask].cpu().numpy()
    )
    metrics_test = balanced_accuracy_score(
        data.y[data.test_mask].cpu().numpy(),
        pred[data.test_mask].cpu().numpy()
    )
    return metrics_train, metrics_test
from sklearn.metrics import balanced_accuracy_score
def test(model, data):
    """Evaluate `model` on `data` and return balanced accuracy on both splits.

    Args:
        model: the DBGNN model; switched to eval mode (disables dropout).
        data: PyG Data object with `y`, `train_mask` and `test_mask`.

    Returns:
        (train_balanced_accuracy, test_balanced_accuracy) as floats.
    """
    model.eval()
    # No gradients are needed for evaluation; avoids building an autograd graph.
    with torch.no_grad():
        _, pred = model(data).max(dim=1)
    metrics_train = balanced_accuracy_score(
        data.y[data.train_mask].cpu().numpy(),
        pred[data.train_mask].cpu().numpy()
    )
    metrics_test = balanced_accuracy_score(
        data.y[data.test_mask].cpu().numpy(),
        pred[data.test_mask].cpu().numpy()
    )
    return metrics_train, metrics_test
In [ ]:
Copied!
data = RandomNodeSplit(num_val=0, num_test=0.3)(data)
model = DBGNN(
num_features =[num_nodes, num_ho_nodes],
num_classes = len(data.y.unique()),
hidden_dims = [16, 32, 8],
p_dropout = 0.4
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_function = torch.nn.CrossEntropyLoss()
data = data.to(device)
data = RandomNodeSplit(num_val=0, num_test=0.3)(data)
model = DBGNN(
num_features =[num_nodes, num_ho_nodes],
num_classes = len(data.y.unique()),
hidden_dims = [16, 32, 8],
p_dropout = 0.4
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_function = torch.nn.CrossEntropyLoss()
data = data.to(device)
In [ ]:
Copied!
data
data
In [ ]:
Copied!
print(model)
print(model)
In [ ]:
Copied!
losses = []
for epoch in range(1000):
    # test() puts the model into eval mode (disabling dropout); without this
    # the model would train in eval mode from epoch 1 onward.
    model.train()
    output = model(data)
    loss = loss_function(output[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Store the scalar, not the tensor: appending the tensor would retain the
    # autograd graph of every epoch and steadily leak memory.
    losses.append(loss.item())
    if epoch % 10 == 0:
        train_ba, test_ba = test(model, data)
        print(f'Epoch: {epoch}, Loss: {loss}, Train balanced accuracy: {train_ba}, Test balanced accuracy: {test_ba}')
losses = []
for epoch in range(1000):
    # test() puts the model into eval mode (disabling dropout); without this
    # the model would train in eval mode from epoch 1 onward.
    model.train()
    output = model(data)
    loss = loss_function(output[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Store the scalar, not the tensor: appending the tensor would retain the
    # autograd graph of every epoch and steadily leak memory.
    losses.append(loss.item())
    if epoch % 10 == 0:
        train_ba, test_ba = test(model, data)
        print(f'Epoch: {epoch}, Loss: {loss}, Train balanced accuracy: {train_ba}, Test balanced accuracy: {test_ba}')
Latent space representation of edges¶
In [ ]:
Copied!
g2.node_index_to_id[0]
g2.node_index_to_id[0]
In [ ]:
Copied!
model.eval()
# Latent representation of second-order nodes after the first higher-order layer.
latent = model.higher_order_layers[0].forward(data.x_h, data.edge_index_higher_order).detach()
node_embedding = TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(latent.cpu())

# Color second-order nodes by class: a solid color when both first-order
# endpoints belong to the same cluster, grey for cross-cluster edges.
class_colors = {0: 'red', 1: 'green', 2: 'blue'}
colors = []
for v, w in g2.nodes:
    if data.y[v] == data.y[w]:
        colors.append(class_colors.get(int(data.y[v]), 'grey'))
    else:
        colors.append('grey')

plt.figure(figsize=(13, 10))
plt.scatter(node_embedding[:, 0], node_embedding[:, 1], c=colors, alpha=0.5)
# Draw the second-order edges faintly underneath the scatter points.
for e in g2.edges:
    src = g2.node_id_to_index[e[0]]
    dst = g2.node_id_to_index[e[1]]
    plt.plot([node_embedding[src, 0], node_embedding[dst, 0]],
             [node_embedding[src, 1], node_embedding[dst, 1]],
             color='lightsteelblue', linestyle='-', alpha=0.2, lw=0.2)
plt.axis('off')
plt.show()
model.eval()
# Latent representation of second-order nodes after the first higher-order layer.
latent = model.higher_order_layers[0].forward(data.x_h, data.edge_index_higher_order).detach()
node_embedding = TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(latent.cpu())

# Color second-order nodes by class: a solid color when both first-order
# endpoints belong to the same cluster, grey for cross-cluster edges.
class_colors = {0: 'red', 1: 'green', 2: 'blue'}
colors = []
for v, w in g2.nodes:
    if data.y[v] == data.y[w]:
        colors.append(class_colors.get(int(data.y[v]), 'grey'))
    else:
        colors.append('grey')

plt.figure(figsize=(13, 10))
plt.scatter(node_embedding[:, 0], node_embedding[:, 1], c=colors, alpha=0.5)
# Draw the second-order edges faintly underneath the scatter points.
for e in g2.edges:
    src = g2.node_id_to_index[e[0]]
    dst = g2.node_id_to_index[e[1]]
    plt.plot([node_embedding[src, 0], node_embedding[dst, 0]],
             [node_embedding[src, 1], node_embedding[dst, 1]],
             color='lightsteelblue', linestyle='-', alpha=0.2, lw=0.2)
plt.axis('off')
plt.show()
In [ ]:
Copied!
model.eval()
# NOTE(review): this cell reuses `latent` produced by the previous cell as the
# input to the second higher-order layer — it must be run directly after it.
latent = model.higher_order_layers[1].forward(latent.cpu(), data.edge_index_higher_order).detach()
node_embedding = TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(latent.cpu())

# Color second-order nodes by class: a solid color when both first-order
# endpoints belong to the same cluster, grey for cross-cluster edges.
class_colors = {0: 'red', 1: 'green', 2: 'blue'}
colors = []
for v, w in g2.nodes:
    if data.y[v] == data.y[w]:
        colors.append(class_colors.get(int(data.y[v]), 'grey'))
    else:
        colors.append('grey')

plt.figure(figsize=(13, 10))
plt.scatter(node_embedding[:, 0], node_embedding[:, 1], c=colors, alpha=0.5)
# Draw the second-order edges faintly underneath the scatter points.
for e in g2.edges:
    src = g2.node_id_to_index[e[0]]
    dst = g2.node_id_to_index[e[1]]
    plt.plot([node_embedding[src, 0], node_embedding[dst, 0]],
             [node_embedding[src, 1], node_embedding[dst, 1]],
             color='lightsteelblue', linestyle='-', alpha=0.2, lw=0.2)
plt.axis('off')
plt.show()
model.eval()
# NOTE(review): this cell reuses `latent` produced by the previous cell as the
# input to the second higher-order layer — it must be run directly after it.
latent = model.higher_order_layers[1].forward(latent.cpu(), data.edge_index_higher_order).detach()
node_embedding = TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(latent.cpu())

# Color second-order nodes by class: a solid color when both first-order
# endpoints belong to the same cluster, grey for cross-cluster edges.
class_colors = {0: 'red', 1: 'green', 2: 'blue'}
colors = []
for v, w in g2.nodes:
    if data.y[v] == data.y[w]:
        colors.append(class_colors.get(int(data.y[v]), 'grey'))
    else:
        colors.append('grey')

plt.figure(figsize=(13, 10))
plt.scatter(node_embedding[:, 0], node_embedding[:, 1], c=colors, alpha=0.5)
# Draw the second-order edges faintly underneath the scatter points.
for e in g2.edges:
    src = g2.node_id_to_index[e[0]]
    dst = g2.node_id_to_index[e[1]]
    plt.plot([node_embedding[src, 0], node_embedding[dst, 0]],
             [node_embedding[src, 1], node_embedding[dst, 1]],
             color='lightsteelblue', linestyle='-', alpha=0.2, lw=0.2)
plt.axis('off')
plt.show()
Latent space representation of nodes¶
In [ ]:
Copied!
model.eval()
# Final model output gives one vector per first-order node.
latent = model.forward(data).detach()
node_embedding = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=10).fit_transform(latent.cpu())

# One color per ground-truth cluster (grey is unreachable here but kept as a
# safe fallback for unexpected labels).
class_colors = {0: 'red', 1: 'green', 2: 'blue'}
colors = [class_colors.get(int(data.y[v]), 'grey') for v in g.nodes]

plt.figure(figsize=(13, 10))
plt.scatter(node_embedding[:, 0], node_embedding[:, 1], c=colors, alpha=0.5)
# Draw the first-order edges faintly underneath the scatter points.
for e in g.edges:
    src, dst = e[0], e[1]
    plt.plot([node_embedding[src, 0], node_embedding[dst, 0]],
             [node_embedding[src, 1], node_embedding[dst, 1]],
             color='lightsteelblue', linestyle='-', alpha=0.2, lw=0.2)
plt.axis('off')
plt.show()
model.eval()
# Final model output gives one vector per first-order node.
latent = model.forward(data).detach()
node_embedding = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=10).fit_transform(latent.cpu())

# One color per ground-truth cluster (grey is unreachable here but kept as a
# safe fallback for unexpected labels).
class_colors = {0: 'red', 1: 'green', 2: 'blue'}
colors = [class_colors.get(int(data.y[v]), 'grey') for v in g.nodes]

plt.figure(figsize=(13, 10))
plt.scatter(node_embedding[:, 0], node_embedding[:, 1], c=colors, alpha=0.5)
# Draw the first-order edges faintly underneath the scatter points.
for e in g.edges:
    src, dst = e[0], e[1]
    plt.plot([node_embedding[src, 0], node_embedding[dst, 0]],
             [node_embedding[src, 1], node_embedding[dst, 1]],
             color='lightsteelblue', linestyle='-', alpha=0.2, lw=0.2)
plt.axis('off')
plt.show()
In [ ]:
Copied!